####################################################
### Bioweapons Embeddings Summaries: Sbert #########
### Script: 01_embed_sbert_v2.py
### Purpose: This code runs the English summaries of the bioweapons
## articles through the sbert embeddings (stsb-roberta-large and
## MPNet Base V2)
### Data In:
## 1) Bioweapons case study articles
## data/bioweapons_casestudy_5_20_2024_public.csv
### Data Out:
## 1) bioweapons articles (same rows as the input), with additional sbert embedding vectors
## data/bioweapons_casestudy_5_20_2024_sbert_embeddings.json
## Same structure as the input file, plus the columns
## "embeddings_base" and "embeddings_roberta",
## each holding the numeric sbert embedding vector
## for that row's "summary" field

from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import pandas as pd

## Load the two pretrained Sentence Transformer (sbert) models we compare.
## The names on the left line up with the checkpoint names in the tuple:
## roberta first, then the MPNet base model.
model_roberta, model_base = (
    SentenceTransformer(checkpoint_name)
    for checkpoint_name in ("stsb-roberta-large", "all-mpnet-base-v2")
)

## The path below is relative: run with the Dataverse replication folder
## as the working directory.
## Read in the articles.
art = pd.read_csv("data/bioweapons_casestudy_5_20_2024_public.csv")

## Materialize the rows as a list of per-article dicts (one dict per row),
## showing a progress bar while the records are collected
embeddings_to_get = list(tqdm(art.to_dict(orient='records')))

## Rebuild a DataFrame from those records; this is the table whose
## "summary" column is embedded and that receives the embedding columns
bby_df = pd.DataFrame(embeddings_to_get)

## Texts to embed: one entry per row, taken from the "summary" column.
## NOTE(review): a missing summary would presumably come through as NaN
## rather than a string -- confirm the column has no gaps before encoding.
summary_texts = bby_df['summary'].tolist()

## Encode the summaries with each model; list() splits the encoder output
## into one item per summary so it can be stored as a DataFrame column
embeddings_roberta = list(model_roberta.encode(summary_texts))
embeddings_base = list(model_base.encode(summary_texts))

## Attach the embeddings (roberta first, then the base model)
bby_df['embeddings_roberta'] = embeddings_roberta
bby_df['embeddings_base'] = embeddings_base

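## Export step (currently disabled): writes bby_df as JSON Lines, one record
## per line. As written, the script does NOT produce the "Data Out" file
## named in the header; uncomment the two lines below to (re)generate it.
#with open('data/bioweapons_casestudy_5_20_2024_sbert_embeddings.json', 'w') as op:
#    op.write(bby_df.to_json(orient='records', lines=True))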
